# Get TFP sample (main_cleaned.csv)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
setwd("../data")

library(tidyverse)
library(dtplyr)
library(data.table)
library(stringr)
library(lubridate)
library(haven)
library(here)
library(DescTools)

# ---------- read data ---------------

# TFP data
TFPData <- read.csv("TFPData_updated.csv")
min(TFPData$fyear)
# CRSP Compustat link table
crsp_a_ccm <- read.csv("crsp_a_ccm.csv")
min(crsp_a_ccm$fyear)
# Corp Bond and CRSP link table
link_crsp_bond <- read.csv("wrdsapps_link_crsp_bond.csv")
# Corp Bond data
wrds_bond <- read_sas("bondret.sas7bdat")
# Capital IQ rating data
ciq_ratings <- fread("ciq_ratings.csv")
# Compustat rating data
comp_ratings <- fread("comp_na_rating.csv")


# 0. create rating map ------------------

## SP Map
SPratingNUM_map <- wrds_bond %>%
  as.data.table() %>%
  select(N_SP, R_SP) %>%
  arrange(N_SP) %>%
  group_by(N_SP) %>%
  summarise(R_SP = last(R_SP)) %>%
  as.data.frame()

## WRDS map
RatingNUM_map <- wrds_bond %>%
  as.data.table() %>%
  select(RATING_NUM, RATING_CAT) %>%
  arrange(RATING_NUM) %>%
  group_by(RATING_NUM) %>%
  summarise(RATING_CAT = last(RATING_CAT)) %>%
  as.data.frame()

# 1. WRDS Bond Return Database ----------------
# Pros: 
#   - Rating across Standard and Poor’s, Moody’s, and Fitch.
# Cons: 
#   - Require to have at least one corporate bond
#   - Sample start 2002

# prepare CRSP Bond linking table
link_crsp_bond <- link_crsp_bond %>% 
  select(CUSIP, PERMNO, PERMCO, link_startdt, link_enddt) %>% 
  mutate(link_startdt = ymd(link_startdt),
         link_enddt = ymd(link_enddt))

# prepare CRSP Compustat linking table
crsp_a_ccm <- crsp_a_ccm %>% 
  select(gvkey, conm, cusip, LPERMNO, LPERMCO, LINKDT, LINKENDDT) %>% 
  mutate(LINKDT = ymd(LINKDT),
         LINKENDDT = ymd(LINKENDDT))

# get Corporate bond rating
CorpB_rating <- wrds_bond %>% 
  as.data.table() %>%
  select(DATE, CUSIP, company_symbol, R_SP, R_MR, R_FR, RATING_NUM, RATING_CAT) %>% 
  # no dup
  group_by(DATE, CUSIP) %>% 
  mutate(id = 1:n()) %>%
  filter(id == 1) %>%
  select(-id) %>%
  ungroup() %>%
  as.data.table()

# link to CRSP PERMNO
CorpB_rating <- CorpB_rating %>%
  as.data.table() %>%
  inner_join(link_crsp_bond, "CUSIP") %>% 
  filter(DATE<=link_enddt&DATE>=link_startdt) %>%
  # no dup
  group_by(DATE, CUSIP) %>% 
  mutate(id = 1:n()) %>%
  filter(id == 1) %>%
  select(-id) %>%
  ungroup() %>%
  as.data.table()

# link to Compustat GVKEY
CorpB_rating <- CorpB_rating %>% 
  as.data.table() %>%
  inner_join(crsp_a_ccm, by = c("PERMNO"="LPERMNO")) %>%
  filter(DATE>=LINKDT) %>%
  filter(DATE<=LINKENDDT|is.na(LINKENDDT)) %>%
  # no dup
  group_by(DATE, CUSIP) %>% 
  mutate(id = 1:n()) %>%
  filter(id == 1) %>%
  select(-id) %>%
  ungroup() %>%
  select(-link_startdt, - link_enddt, -LINKDT, LPERMCO, -LINKENDDT) %>%
  as.data.table()

CorpB_rating_firmyear <- CorpB_rating %>% 
  filter(!is.na(RATING_NUM)) %>%
  # for each firm-month, get the median rating across all bonds
  group_by(gvkey, DATE) %>%
  summarise(PERMNO = last(PERMNO),
            PERMCO = last(PERMCO),
            cusip = last(cusip),
            TICKER = last(company_symbol),
            conm = last(conm),
            RATING_NUM = median(RATING_NUM)) %>%
  arrange(gvkey, DATE) %>%
  ungroup() %>%
  # For each firm-year, take median over year
  mutate(year = year(DATE)) %>%
  group_by(gvkey, year) %>%
  summarise(PERMNO = last(PERMNO),
            PERMCO = last(PERMCO),
            cusip = last(cusip),
            TICKER = last(TICKER),
            conm = last(conm),
            RATING_NUM = median(RATING_NUM)) %>%
  ungroup() %>%
  # round to integral
  mutate(RATING_NUM = round(RATING_NUM)) %>%
  as.data.frame()


# 2. Compustat Rating data -------------------------------
# Pros: 
#   - over early sample
#   - easy to use
# Cons: 
#   - stop at February 2017

comp_ratings_year <- comp_ratings %>% 
  # select S&P Domestic Long Term Issuer Credit Rating
  select(gvkey, datadate, splticrm) %>%
  # not missing
  filter(splticrm != "") %>%
  mutate(datadate = ymd(datadate)) %>%
  arrange(gvkey, datadate) %>% 
  # map category to numeric rating
  inner_join(SPratingNUM_map, by = c("splticrm" = "R_SP")) %>%
  arrange(gvkey, datadate) %>%
  mutate(year = year(datadate)) %>%
  filter(!is.na(N_SP)) %>%
  # get median over year
  group_by(gvkey, year) %>%
  summarise(sp_rating = median(N_SP)) %>%
  # round
  mutate(sp_rating = round(sp_rating)) %>%
  arrange(gvkey, year) %>%
  as.data.table()


# 3. Capital IQ Rating data --------------------------------------------
# Pros: 
#   - up-to-date
# Cons: 
#   - only have observation when credit change


ciq_ratings_year <- ciq_ratings %>%
  select(gvkey, ratingdate, ratingsymbol) %>%
  mutate(ratingdate = ymd(ratingdate)) %>%
  # remove all number from rating symbol
  mutate(ratingsymbol = str_replace_all(ratingsymbol, "[:digit:]", "")) %>%
  # map with numeric rating
  inner_join(SPratingNUM_map, by = c("ratingsymbol" = "R_SP")) %>%
  arrange(gvkey, ratingdate) %>%
  filter(!is.na(N_SP)) %>%
  filter(!is.na(gvkey)) %>%
  mutate(year = year(ratingdate)) %>%
  # only keep after 2017 (piror 2017 data should be the same as compustat sample)
  filter(year >= 2017) %>%
  arrange(gvkey, year) %>%
  group_by(gvkey, year) %>%
  # take median over year
  summarise(N_SP2 = median(N_SP)) %>%
  # round
  mutate(N_SP2 = round(N_SP2)) %>%
  as.data.table()

# 4. Merge with TFP --------------------------------

Main_sample <- TFPData %>% 
  # left join with compustat sample
  left_join(comp_ratings_year, by = c("gvkey", "fyear"="year")) %>%
  # left join with capital IQ sample
  left_join(ciq_ratings_year, by = c("gvkey", "fyear"="year")) %>% 
  # left join with WRDS Bond sample
  left_join(CorpB_rating_firmyear %>% select(gvkey, year, RATING_NUM),
            by = c("gvkey", "fyear"="year")) %>%
  # use capital IQ rating if capital IQ is not missing and after 2017,
  # use compustat or keep missing otherwise
  mutate(sp_rating2 = ifelse(!is.na(N_SP2)&fyear>=2017, N_SP2, sp_rating)) %>%
  # use WRDS bond rating if avaible
  mutate(Rating_final = ifelse(!is.na(RATING_NUM), RATING_NUM, sp_rating2)) %>%
  group_by(gvkey) %>%
  # fill missing value (downward imputation)
  fill(Rating_final, .direction = "down") %>%
  ungroup() %>%
  select(gvkey, fyear, TFP, Rating_final) %>%
  # map numeric rating with category rating
  left_join(SPratingNUM_map %>% filter(!is.na(N_SP)), by = c("Rating_final" = "N_SP"))


# 5. Add other financial variables -----------

## read more data 

crsp_ME <- read_sas("CRSP_ME.sas7bdat")
Altman <- read_sas("Altman.sas7bdat")
financial_ratio <- fread("wrdsapps_finratio_ibes.csv")
comp <- read_dta("compustat_data.dta")

## 5.1 Ratios -----------
finratio_year <- financial_ratio %>%
  as.data.table() %>%
  mutate(qdate = ymd(qdate)) %>% 
  mutate(year = year(qdate),
         month = month(qdate),
         qrt = quarter(qdate)) %>% 
  filter(!is.na(year)) %>%
  group_by(gvkey, year) %>%
  summarise(permno = last(permno),
            intcov = mean(intcov, na.rm=T), 
            intcov_ratio = mean(intcov_ratio, na.rm=T)) %>%
  ungroup() %>%
  as.data.table()


## 5.2 sales -----------
comp_year <- comp %>% 
  mutate(gvkey = as.numeric(gvkey)) %>%
  filter(sale>0) %>%
  mutate(sale_emp = sale/emp) %>%
  mutate(log_sale = log(sale)) %>%
  filter(!is.na(log_sale)) %>% 
  filter(!is.na(sale_emp))

## 5.3 ME -----------
crsp_ME_year <- crsp_ME %>%
  as.data.table() %>%
  mutate(year = year(DATE)) %>%
  filter(!is.na(ME)) %>%
  group_by(PERMNO, year) %>%
  summarise(ME = mean(ME, na.rm = T)) %>%
  ungroup() %>%
  as.data.table()

## 5.4 Altman -----------
Altman_year <- Altman %>% 
  mutate(gvkey = as.numeric(gvkey)) %>% 
  mutate(year = year(datadate)) %>% 
  filter(!is.na(year)) %>% 
  group_by(gvkey, year) %>% 
  summarise(Altman_Z = mean(Altman_Z, na.rm=T),
            Altman_Z2 = mean(Altman_Z2, na.rm=T)) %>%
  ungroup() %>%
  as.data.table()

##  5.5 merge to the main sample  -----------
Main_sample <- Main_sample %>% 
  left_join(comp_year , by = c("gvkey", "fyear")) %>%
  left_join(Altman_year, by = c("gvkey", "fyear"="year")) %>%
  left_join(finratio_year, by = c("gvkey", "fyear"="year")) %>%
  left_join(crsp_ME_year, by = c("permno"="PERMNO", "fyear"="year")) %>%
  group_by(gvkey, fyear) %>%
  slice(n()) %>%
  select(gvkey, permno, fyear, everything()) %>%
  ungroup()



# 6. Winsorization -----------------------
wins <- function(x, p = c(0.01, 0.99), na.rm = TRUE) {
  q <- quantile(x, probs = p, na.rm = na.rm)
  x[x < q[1]] <- q[1]
  x[x > q[2]] <- q[2]
  return(x)
}
Main_sample <- Main_sample %>%
  mutate(TFP = exp(TFP)) %>%
  mutate(
    TFP_full = TFP,
    TFP = wins(TFP),
    intcov = wins(intcov),
    intcov_ratio = wins(intcov_ratio),
    Altman_Z_full = Altman_Z,
    Altman_Z = wins(Altman_Z),
    Altman_Z2 = wins(Altman_Z2),
    sale_emp = wins(sale_emp),
    log_sale = wins(log_sale),
    ME = wins(ME),
    sale = wins(sale),
    at = wins(at)
  )

# save  
fwrite(Main_sample, "../main_cleaned.csv")




